library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(dplyr)
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Load the raw NYT bestseller export.
nyt_bestseller <- read.csv("NYT_best_seller .csv", stringsAsFactors = FALSE)

# Keep fiction lists only. A bare, case-insensitive "Fiction" pattern also
# matches "Nonfiction" / "Non-fiction", so those list names are excluded
# explicitly.
fiction_df <- subset(
  nyt_bestseller,
  grepl("Fiction", list_name, ignore.case = TRUE) &
    !grepl("non[- ]?fiction", list_name, ignore.case = TRUE)
)

# Parse both date columns using the month/day/two-digit-year format.
fiction_df$bestsellers_date <- as.Date(
  fiction_df$bestsellers_date,
  format = "%m/%d/%y"
)
fiction_df$published_date <- as.Date(
  fiction_df$published_date,
  format = "%m/%d/%y"
)

# Keep books published from 2010 through 2016. Comparing Date values directly
# avoids comparing the character output of format() against a number.
# subset() also drops rows whose published_date is NA (failed to parse).
fiction_df <- subset(
  fiction_df,
  published_date >= as.Date("2010-01-01") &
    published_date <= as.Date("2016-12-31")
)

# Days from publication to the bestseller date; the value is negative when
# the bestseller date is earlier than the published date.
fiction_df$days_to_best_seller <- as.numeric(
  difftime(
    fiction_df$bestsellers_date,
    fiction_df$published_date,
    units = "days"
  )
)

# Scatter plot of that gap against publication date with a LOESS trend line.
ggplot(fiction_df, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) + # Points colored by gap
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) + # `linewidth` replaces `size` for lines (ggplot2 >= 3.4.0)
  scale_color_gradient(low = "blue", high = "red") + # Blue (low) to red (high)
  scale_x_date(labels = date_format("%Y"), breaks = "1 year") + # One x-axis tick per year
  labs(
    title = "Days to Bestseller vs. Publication Date (2010-2016)",
    x = "Publication Date",
    y = "Days to Become Bestseller",
    color = "Days to Bestseller"
  ) +
  theme_minimal(base_size = 14) + # Clean theme with larger text
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'
# Load necessary libraries
library(ggplot2)
library(scales) # For formatting dates
# Parse both date columns of the full dataset (month/day/two-digit year).
nyt_bestseller$bestsellers_date <- as.Date(
  nyt_bestseller$bestsellers_date,
  format = "%m/%d/%y"
)
nyt_bestseller$published_date <- as.Date(
  nyt_bestseller$published_date,
  format = "%m/%d/%y"
)

# Keep books published from 2010 through 2016. This overwrites nyt_bestseller
# with the filtered rows. Comparing Date values directly avoids comparing the
# character output of format() against a number; subset() also drops rows
# whose published_date is NA.
nyt_bestseller <- subset(
  nyt_bestseller,
  published_date >= as.Date("2010-01-01") &
    published_date <= as.Date("2016-12-31")
)

# Days from publication to the bestseller date.
# NOTE(review): the values printed further down are all -14 or -15, i.e. the
# bestseller date precedes published_date on every row. published_date may be
# the list's publication date rather than the book's -- confirm before reading
# this as "days to become a bestseller".
nyt_bestseller$days_to_best_seller <- as.numeric(
  difftime(
    nyt_bestseller$bestsellers_date,
    nyt_bestseller$published_date,
    units = "days"
  )
)

# Scatter plot of that gap against publication date with a LOESS trend line.
ggplot(nyt_bestseller, aes(x = published_date, y = days_to_best_seller)) +
  geom_point(aes(color = days_to_best_seller), alpha = 0.6, size = 2) + # Points colored by gap
  geom_smooth(method = "loess", se = TRUE, color = "red", linewidth = 1) + # `linewidth` replaces `size` for lines (ggplot2 >= 3.4.0)
  scale_color_gradient(low = "blue", high = "red") + # Blue (low) to red (high)
  scale_x_date(labels = date_format("%Y"), breaks = "1 year") + # One x-axis tick per year
  labs(
    title = "Days to Bestseller vs. Publication Date (2010-2016)",
    x = "Publication Date",
    y = "Days to Become Bestseller",
    color = "Days to Bestseller"
  ) +
  theme_minimal(base_size = 14) + # Clean theme with larger text
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) # Rotate x-axis labels for readability
## `geom_smooth()` using formula = 'y ~ x'
library(dplyr)
# Print every computed publication-to-bestseller gap (in days)
print(nyt_bestseller[["days_to_best_seller"]])
## [1] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [19] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [37] -14 -14 -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [55] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [73] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [91] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [109] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [127] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [145] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [163] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [181] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [199] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [217] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [235] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14
## [253] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [271] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [289] -14 -14 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [307] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [325] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [343] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [361] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [379] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [397] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [415] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [433] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [451] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [469] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [487] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -14 -14 -14 -14
## [505] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [523] -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14 -14
## [541] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [559] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [577] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [595] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [613] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [631] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [649] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [667] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [685] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [703] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [721] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [739] -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15 -15
## [757] -15 -15 -15 -15
# Filter for 2010-2016
# Load required libraries
library(dplyr)
library(readr)
# Import the 2008-2016 bestseller file
df <- read_csv("NYT_best_seller_08_16.csv", show_col_types = FALSE)
# Parse published_date as month/day/two-digit-year
df[["published_date"]] <- as.Date(df[["published_date"]], format = "%m/%d/%y")
# Report the earliest and latest parsed dates (ignoring NA) before filtering
print(range(df[["published_date"]], na.rm = TRUE))
## [1] "2008-06-08" "2016-06-12"
# Restrict to books published from 2010-01-01 through 2016-12-31
df_filtered <- filter(
  df,
  published_date >= as.Date("2010-01-01"),
  published_date <= as.Date("2016-12-31")
)
# Tally rows per publisher, largest count first
publisher_counts_2010_2016 <- count(df_filtered, publisher, sort = TRUE)
# Sanity check: dimensions of the tally
print(dim(publisher_counts_2010_2016))
## [1] 115 2
# First rows of the tally
print(head(publisher_counts_2010_2016))
## # A tibble: 6 × 2
## publisher n
## <chr> <int>
## 1 Grand Central 63
## 2 Bantam 54
## 3 Berkley 40
## 4 Vintage 28
## 5 Ballantine 27
## 6 Putnam 26
# Show the tally itself
print(publisher_counts_2010_2016)
## # A tibble: 115 × 2
## publisher n
## <chr> <int>
## 1 Grand Central 63
## 2 Bantam 54
## 3 Berkley 40
## 4 Vintage 28
## 5 Ballantine 27
## 6 Putnam 26
## 7 Dell 25
## 8 Little, Brown 24
## 9 Simon & Schuster 20
## 10 Knopf 19
## # ℹ 105 more rows
# Horizontal bar chart of every publisher, ordered by bestseller count
ggplot(
  data = publisher_counts_2010_2016,
  mapping = aes(x = reorder(publisher, n), y = n)
) +
  geom_col(fill = "skyblue") +
  labs(
    title = "Publisher Counts in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  coord_flip() +
  theme_minimal()
# The ten publishers with the highest counts
top_publishers <- head(arrange(publisher_counts_2010_2016, desc(n)), 10)
# Same horizontal bar chart, limited to those ten publishers
ggplot(
  data = top_publishers,
  mapping = aes(x = reorder(publisher, n), y = n)
) +
  geom_col(fill = "red") +
  labs(
    title = "Top 10 Publishers in NYT Best Sellers",
    subtitle = "Data from 2010-2016",
    x = "Publisher",
    y = "Number of Best Sellers"
  ) +
  coord_flip() +
  theme_minimal()
# Load necessary libraries
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller data
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, stored as an ordered factor in calendar order
data[["month"]] <- factor(
  format(data[["published_date"]], "%B"),
  levels = month.name,
  ordered = TRUE
)
# One row per month with the number of books in it
monthly_counts <- summarise(group_by(data, month), count = n())
# Bar chart of the monthly counts, one fill color per month, no legend
ggplot(data = monthly_counts, mapping = aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Frequency of Bestseller Books Released Per Month",
    x = "Month",
    y = "Number of Bestsellers"
  )
# Second read of the same file with read.csv() defaults; this object is not
# referenced again in the visible part of this script.
updated_categories <- read.csv("Updated_Bestsellers_Data_Cleaned.csv")
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller data
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, as a factor in calendar order
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Stacked bar chart: row counts per month, split by New_Category
ggplot(data = data, mapping = aes(x = month, fill = New_Category)) +
  geom_bar() +
  labs(
    title = "Distribution of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller data
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, as a factor in calendar order
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Number of rows for each month / category combination present in the data
heatmap_data <- count(data, month, New_Category)
# Heatmap: one tile per combination, darker blue for larger counts
ggplot(data = heatmap_data, mapping = aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Heatmap of Bestsellers by Month and Category",
    x = "Month",
    y = "Category",
    fill = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
library(ggplot2)
library(dplyr)
library(lubridate)
# Read the cleaned bestseller data
data <- read.csv("Updated_Bestsellers_Data_Cleaned.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, as a factor in calendar order
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Number of rows for each month / category combination present in the data
area_chart_data <- count(data, month, New_Category)
# Stacked area chart: one band per category across the months
ggplot(
  data = area_chart_data,
  mapping = aes(x = month, y = n, fill = New_Category, group = New_Category)
) +
  geom_area(position = "stack", alpha = 0.7) +
  labs(
    title = "Stacked Area Chart of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Bestsellers per month, from dataclean_updated.csv
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, stored as an ordered factor in calendar order
data[["month"]] <- factor(
  format(data[["published_date"]], "%B"),
  levels = month.name,
  ordered = TRUE
)
# One row per month with the number of books in it
monthly_counts <- summarise(group_by(data, month), count = n())
# Bar chart of the monthly counts, one fill color per month, no legend
ggplot(data = monthly_counts, mapping = aes(x = month, y = count, fill = month)) +
  geom_col(show.legend = FALSE) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Frequency of Bestseller Books Released Per Month",
    x = "Month",
    y = "Number of Bestsellers"
  )
# Read dataclean_updated.csv
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, as a factor in calendar order
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Number of rows for each month / category combination present in the data
area_chart_data <- count(data, month, New_Category)
# Stacked area chart: one band per category across the months
ggplot(
  data = area_chart_data,
  mapping = aes(x = month, y = n, fill = New_Category, group = New_Category)
) +
  geom_area(position = "stack", alpha = 0.7) +
  labs(
    title = "Stacked Area Chart of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Month-by-category heatmap, from dataclean_updated.csv
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, as a factor in calendar order
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Number of rows for each month / category combination present in the data
heatmap_data <- count(data, month, New_Category)
# Heatmap: one tile per combination, darker blue for larger counts
ggplot(data = heatmap_data, mapping = aes(x = month, y = New_Category, fill = n)) +
  geom_tile(color = "white") +
  scale_fill_gradient(low = "lightblue", high = "darkblue") +
  labs(
    title = "Heatmap of Bestsellers by Month and Category",
    x = "Month",
    y = "Category",
    fill = "Count"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Read dataclean_updated.csv
data <- read.csv("dataclean_updated.csv", stringsAsFactors = FALSE)
# Parse published_date as month/day/two-digit-year
data[["published_date"]] <- as.Date(data[["published_date"]], format = "%m/%d/%y")
# Full month name of each date, as a factor in calendar order
data[["month"]] <- factor(format(data[["published_date"]], "%B"), levels = month.name)
# Stacked bar chart: row counts per month, split by New_Category
ggplot(data = data, mapping = aes(x = month, fill = New_Category)) +
  geom_bar() +
  labs(
    title = "Distribution of Bestsellers by Month",
    x = "Month",
    y = "Count of Bestsellers",
    fill = "Category"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))